Introduction
# load in the libraries
library(stringi)
library(plotly)
Loading required package: ggplot2
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library(plyr)
Attaching package: ‘plyr’
The following objects are masked from ‘package:plotly’:
arrange, mutate, rename, summarise
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
[30m── [1mAttaching packages[22m ──────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──[39m
[30m[32m✓[30m [34mtibble [30m 2.1.3 [32m✓[30m [34mdplyr [30m 0.8.3
[32m✓[30m [34mtidyr [30m 1.0.0 [32m✓[30m [34mstringr[30m 1.4.0
[32m✓[30m [34mreadr [30m 1.3.1 [32m✓[30m [34mforcats[30m 0.4.0
[32m✓[30m [34mpurrr [30m 0.3.3 [39m
[30m── [1mConflicts[22m ─────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31mx[30m [34mdplyr[30m::[32marrange()[30m masks [34mplyr[30m::arrange(), [34mplotly[30m::arrange()
[31mx[30m [34mpurrr[30m::[32mcompact()[30m masks [34mplyr[30m::compact()
[31mx[30m [34mdplyr[30m::[32mcount()[30m masks [34mplyr[30m::count()
[31mx[30m [34mdplyr[30m::[32mfailwith()[30m masks [34mplyr[30m::failwith()
[31mx[30m [34mdplyr[30m::[32mfilter()[30m masks [34mplotly[30m::filter(), [34mstats[30m::filter()
[31mx[30m [34mdplyr[30m::[32mid()[30m masks [34mplyr[30m::id()
[31mx[30m [34mdplyr[30m::[32mlag()[30m masks [34mstats[30m::lag()
[31mx[30m [34mdplyr[30m::[32mmutate()[30m masks [34mplyr[30m::mutate(), [34mplotly[30m::mutate()
[31mx[30m [34mdplyr[30m::[32mrename()[30m masks [34mplyr[30m::rename(), [34mplotly[30m::rename()
[31mx[30m [34mdplyr[30m::[32msummarise()[30m masks [34mplyr[30m::summarise(), [34mplotly[30m::summarise()
[31mx[30m [34mdplyr[30m::[32msummarize()[30m masks [34mplyr[30m::summarize()[39m
Analysis
Read in the data
Read in data from files
beer = read.csv("./data/Beers.csv",header = TRUE)
breweries = read.csv("./data/Breweries.csv",header = TRUE, strip.white = TRUE)
#display the dataframes
beer
breweries
NA
How many breweries are there in each state?
Heat Map of Breweies per State
## add lowercase state name for heat map
lstates = tolower(state.name)
state_count = state_breweies %>%
add_rownames("region") %>%
mutate( region=lstates[match(State, state.abb)] )
Deprecated, use tibble::rownames_to_column() instead.
## find center of each state for text position
snames <- data.frame(region=lstates, long=state.center$x, lat=state.center$y)
snames <- merge(snames, state_count, by="region")
## merge map data with counts data
choro <- left_join(
map_data("state"),
state_count
)
Joining, by = "region"
ggplot(choro, aes(long, lat)) +
geom_polygon(aes(group = group, fill = n)) +
geom_text(data=snames, aes(long, lat, label=n)) +
coord_quickmap()

NA
NA
The range of breweries per state ranges from 1 to 47, with Colorado holding the highest number of breweries per state. The heat map above shows North and South Dakotas along with West Virginia as having the lowest number of breweries (1 per state). Following Colorado is California with 39 and Oregon with 29 breweries. There appears to be higher number of breweries in the north east and west coast, as opposed to the central states, with the exception of Colorado and Texas.
Cleaning the data
Prepering/transforming the data into a usable form for analysis, visualization, etc… #### Merging Dataframes Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.
attach(beer)
beer[order(Brewery_id),] # sort the data to determine column for merge
# merge on Brewery ID
breweries_named <- plyr::rename(breweries, c("Brew_ID"="Brewery_id"))
brewing_beer <- merge(breweries_named,beer,by="Brewery_id", all=TRUE) # outter join
brewed_beer <- plyr::rename(brewing_beer, c("Name.x"="Brewery", "Name.y"="Beer")) # rename breweries and beer
head(brewed_beer,6) # show the first 6 rows of data
NA
Missing Data
Missing data are in columns ABV (62) and IBU (1005) only. Cleaning data in multiple options: 1. complete records only 2. replacing NA with the averages of the remainder of the column
colSums(is.na(averaged_beer))
Brewery_id Brewery City State Beer Beer_ID ABV IBU Style Ounces ABVpercent
0 0 0 0 0 0 0 0 0 0 0
Median Alcohol Content
Median of Alcohol by Volume and Bitterness by State
# group by state, get median of ABV, IBU
medians <- as.data.frame(aggregate(completed_beer[,c(7,8)], by=list(completed_beer$State), FUN=median))
median_df <- plyr::rename(medians, c("Group.1"="State")) # rename the column to State
median_graph <- median_df %>% ggplot(aes(x = ABV, y = IBU, color=State)) + geom_point() + ggtitle("Median Alcohol Content and Bitterness by State") # plot scatter plot
ABV_bar <-ggplot(data=median_df, aes(x = State, y = ABV, fill = State)) +
geom_bar(stat="identity", width = 0.75) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Median ABV by State")
ggplotly(ABV_bar)
IBU_bar <-ggplot(data=median_df, aes(x = State, y = IBU, fill = State)) +
geom_bar(stat="identity", width = 0.75) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Median IBU by State")
ggplotly(IBU_bar)
NA
Max ABV and IBU
Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
# find the max ABV and IBU from each state
maximums <- as.data.frame(aggregate(completed_beer[,c(7,8)], by=list(completed_beer$State), FUN=max))
max_df <- plyr::rename(maximums, c("Group.1"="State")) # rename the column to State
#find the max ABV and IBU states
max_ABV <- max_df %>% filter(ABV == max(ABV))
max_IBU <- max_df %>% filter(IBU == max(IBU))
max_state <- rbind(max_ABV, max_IBU)
max_state
NA
Summary Statistics
The summary statistics and distribution of the ABV variable.
#Creates column representing ABV in percentage which is more user-readable and in-line with how
#ABV is represented by the industry
print("Values below are in percentage (%)")
[1] "Values below are in percentage (%)"
#Using method 1 for handling missing data
completed_beer$ABVpercent <- completed_beer$ABV*100
summary(completed_beer$ABVpercent) #Range, quartiles, and mean of ABV percentage value
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.700 5.000 5.700 5.991 6.800 12.500
cat("Standard deviation: ", sd(completed_beer$ABVpercent)) #Standard deviation of ABV percentage value
Standard deviation: 1.357633
#Using method 2 for handling missing data
averaged_beer$ABVpercent <- averaged_beer$ABV*100
summary(averaged_beer$ABVpercent)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.100 5.000 5.700 5.977 6.700 12.800
cat("Standard deviation: ", sd(averaged_beer$ABVpercent))
Standard deviation: 1.336634
Relationship between Bitterness and Alcohol Content
Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
#Uses method 1 for handling missing data (remove incomplete lines)
#Correlation between ABV percentage and IBU; function uses Pearson method as default
cor(x=completed_beer$ABVpercent, y=completed_beer$IBU)
[1] 0.6706215
#Visualizations between ABV percentage and IBU
ggplot(data=completed_beer, mapping=aes(x= IBU, y= ABVpercent, position_jitter())) + geom_point()

ggplot(data=completed_beer, mapping=aes(x= ABVpercent, y= IBU, position_jitter())) + geom_point()

#Uses method 2 for handling missing data (replace NAs with average of present data)
#Correlation between ABV percentage and IBU; function uses Pearson method as default
cor(x=averaged_beer$ABVpercent, y=averaged_beer$IBU)
[1] 0.520011
#Visualizations between ABV percentage and IBU
ggplot(data=averaged_beer, mapping=aes(x= IBU, y= ABVpercent, position_jitter())) + geom_point()

ggplot(data=averaged_beer, mapping=aes(x= ABVpercent, y= IBU, position_jitter())) + geom_point()

IPAs vs. Ales
Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN classification to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.
In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned. Creativity and alternative solutions are always encouraged.
ipa = brewed_beer[grep("IPA", brewed_beer$Style), ]
not_ipa = brewed_beer[-grep("IPA", brewed_beer$Style), ]
ales = not_ipa[grep("Ale", not_ipa$Style), ]
Additional inferences
Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
---
title: "Brewing Up a Storm"
output: html_notebook
editor_options: 
  chunk_output_type: console
---
### Introduction

```{r}
# load in the libraries
library(stringi)
library(plotly)
library(plyr)
library(tidyverse)
```

## Analysis

### Read in the data
Read in data from files
```{r}
beer = read.csv("./data/Beers.csv",header = TRUE)
breweries = read.csv("./data/Breweries.csv",header = TRUE, strip.white = TRUE)

#display the dataframes
beer
breweries

```


### How many breweries are there in each state?

```{r}

str(breweries) # check that State is a Factor

state_breweies <- breweries %>% group_by(State) %>% tally() # count the number of breweries within a state
unique_state_breweies <- breweries %>% group_by(State) %>% tally(n_distinct(Name)) # check for any duplicates

```

#### Heat Map of Breweies per State

```{r}
## add lowercase state name for heat map
lstates = tolower(state.name)
state_count = state_breweies %>% 
    add_rownames("region") %>% 
    mutate( region=lstates[match(State, state.abb)] )

## find center of each state for text position 
snames <- data.frame(region=lstates, long=state.center$x, lat=state.center$y)
snames <- merge(snames, state_count, by="region")

## merge map data with counts data
choro <- left_join(
  map_data("state"), 
  state_count
)
ggplot(choro, aes(long, lat)) +
  geom_polygon(aes(group = group, fill = n)) + 
  geom_text(data=snames, aes(long, lat, label=n)) +
  coord_quickmap()


```
The range of breweries per state ranges from 1 to 47, with Colorado holding the highest number of breweries per state. The heat map above shows North and South Dakotas along with West Virginia as having the lowest number of breweries (1 per state). Following Colorado is California with 39 and Oregon with 29 breweries. There appears to be higher number of breweries in the north east and west coast, as opposed to the central states, with the exception of Colorado and Texas. 

### Cleaning the data
Prepering/transforming the data into a usable form for analysis, visualization, etc... 
#### Merging Dataframes
Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the merged file.  
```{r}

attach(beer)
beer[order(Brewery_id),] # sort the data to determine column for merge

# merge on Brewery ID
breweries_named <- plyr::rename(breweries, c("Brew_ID"="Brewery_id"))

brewing_beer <- merge(breweries_named,beer,by="Brewery_id", all=TRUE) # outter join

brewed_beer <- plyr::rename(brewing_beer, c("Name.x"="Brewery", "Name.y"="Beer")) # rename breweries and beer

head(brewed_beer,6) # show the first 6 rows of data

```

#### Missing Data
Missing data are in columns ABV (62) and IBU (1005) only. 
Cleaning data in multiple options:
1. complete records only
2. replacing NA with the averages of the remainder of the column

```{r}
# selecting only complete cases
which(is.na(brewed_beer)) # determine which rows contain NA
colSums(is.na(brewed_beer)) # summary of the number of NA in each column
completed_beer <- brewed_beer[complete.cases(brewed_beer), ] # df with only complete records

# replacing NA with averages
averaged_beer <- brewed_beer # make a duplicate of the original df to manipulate
averageABV <- mean(averaged_beer$ABV, na.rm = TRUE) # set average variable for ABV
averageIBU <- mean(averaged_beer$IBU, na.rm = TRUE) # set average variable for IBU

#replace NA values with the average of the present values
averaged_beer$IBU[is.na(averaged_beer$IBU)]<-averageIBU
averaged_beer$ABV[is.na(averaged_beer$ABV)]<-averageABV


colSums(is.na(averaged_beer))

```

### Median Alcohol Content
Median of Alcohol by Volume and Bitterness by State
```{r}
# group by state, get median of ABV, IBU
medians <- as.data.frame(aggregate(completed_beer[,c(7,8)], by=list(completed_beer$State), FUN=median)) 
median_df <- plyr::rename(medians, c("Group.1"="State")) # rename the column to State

median_graph <- median_df %>% ggplot(aes(x = ABV, y = IBU, color=State)) + geom_point() + ggtitle("Median Alcohol Content and Bitterness by State") # plot scatter plot

ABV_bar <-ggplot(data=median_df, aes(x = State, y = ABV, fill = State)) +
  geom_bar(stat="identity", width = 0.75) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Median ABV by State")
ggplotly(ABV_bar)

IBU_bar <-ggplot(data=median_df, aes(x = State, y = IBU, fill = State)) +
  geom_bar(stat="identity", width = 0.75) + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle("Median IBU by State")
ggplotly(IBU_bar)

```
### Max ABV and IBU
Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
```{r}
# find the max ABV and IBU from each state
maximums <- as.data.frame(aggregate(completed_beer[,c(7,8)], by=list(completed_beer$State), FUN=max))
max_df <- plyr::rename(maximums, c("Group.1"="State")) # rename the column to State

#find the max ABV and IBU states
max_ABV <- max_df %>% filter(ABV == max(ABV))
max_IBU <- max_df %>% filter(IBU == max(IBU))

max_state <- rbind(max_ABV, max_IBU)
max_state

```

### Summary Statistics
The summary statistics and distribution of the ABV variable.

```{r}
#Creates column representing ABV in percentage which is more user-readable and in-line with how
#ABV is represented by the industry
print("Values below are in percentage (%)")

#Using method 1 for handling missing data
completed_beer$ABVpercent <- completed_beer$ABV*100 

summary(completed_beer$ABVpercent) #Range, quartiles, and mean of ABV percentage value

cat("Standard deviation: ", sd(completed_beer$ABVpercent)) #Standard deviation of ABV percentage value

#Using method 2 for handling missing data
averaged_beer$ABVpercent <- averaged_beer$ABV*100

summary(averaged_beer$ABVpercent)

cat("Standard deviation: ", sd(averaged_beer$ABVpercent))


```

### Relationship between Bitterness and Alcohol Content
Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot.  Make your best judgment of a relationship and EXPLAIN your answer.
```{r}
#Uses method 1 for handling missing data (remove incomplete lines)

#Correlation between ABV percentage and IBU; function uses Pearson method as default
cor(x=completed_beer$ABVpercent, y=completed_beer$IBU)

#Visualizations between ABV percentage and IBU
ggplot(data=completed_beer, mapping=aes(x= IBU, y= ABVpercent, position_jitter())) + geom_point()

ggplot(data=completed_beer, mapping=aes(x= ABVpercent, y= IBU, position_jitter())) + geom_point()
```

```{r}
#Uses method 2 for handling missing data (replace NAs with average of present data)

#Correlation between ABV percentage and IBU; function uses Pearson method as default
cor(x=averaged_beer$ABVpercent, y=averaged_beer$IBU)

#Visualizations between ABV percentage and IBU
ggplot(data=averaged_beer, mapping=aes(x= IBU, y= ABVpercent, position_jitter())) + geom_point()

ggplot(data=averaged_beer, mapping=aes(x= ABVpercent, y= IBU, position_jitter())) + geom_point()
```

### IPAs vs. Ales
Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA).  You decide to use KNN classification to investigate this relationship.  Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand conceptually.

In addition, while you have decided to use KNN to investigate this relationship (KNN is required) you may also feel free to supplement your response to this question with any other methods or techniques you have learned.  Creativity and alternative solutions are always encouraged.  
```{r}
ipa = brewed_beer[grep("IPA", brewed_beer$Style), ]
not_ipa = brewed_beer[-grep("IPA", brewed_beer$Style), ]
ales = not_ipa[grep("Ale", not_ipa$Style), ]

```

### Additional inferences
Knock their socks off!  Find one other useful inference from the data that you feel Budweiser may be able to find value in.  You must convince them why it is important and back up your conviction with appropriate statistical evidence. 
```{r}

```










